# Core scientific stack.
import numpy as np
import pandas as pd
# Show every column when displaying DataFrames (the raw data is wide).
pd.set_option('display.max_columns', None)
# NOTE(review): rgb_to_hsv appears unused in this chunk — confirm before removing.
from matplotlib.colors import rgb_to_hsv
import seaborn as sns
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Slightly larger fonts for all seaborn figures.
sns.set(font_scale=1.3)
# Load the raw train/test splits (tab-separated files).
train_raw = pd.read_csv('data/train.tsv', sep='\t')
test_raw = pd.read_csv('data/test.tsv', sep='\t')

# Peek at the first rows of each split.
train_raw.head()
test_raw.head()
We can see that the data contains multiple nested objects (Dictionaries & Lists, represented as strings).
These attributes will be parsed during feature extraction.
Let's see some statistics regarding our data:
# Summary statistics, with every number rendered in general ('g') format so
# large budgets/revenues stay readable instead of scientific notation.
train_raw.describe().applymap(lambda v: format(v, 'g'))
test_raw.describe().applymap(lambda v: format(v, 'g'))
We can see that both train and test have budget and runtime values of 0.
This is unlikely and thus will be considered as Missing value.
# Count suspicious zero values: a 0 budget or 0 runtime almost certainly
# means "unknown", so these will be treated as missing values later on.
print(f"Amount of movies (train) with 0 budget: {(train_raw.budget == 0).sum()}")
print(f"Amount of movies (train) with 0 runtime: {(train_raw.runtime == 0).sum()}")
print(f"Amount of movies (test) with 0 budget: {(test_raw.budget == 0).sum()}")
# BUG FIX: this line counted zero runtimes but its label said "0 budget".
print(f"Amount of movies (test) with 0 runtime: {(test_raw.runtime == 0).sum()}")
# Pairwise correlations of the numeric columns; rank features by their
# correlation with the target (revenue), strongest first.
corr_mat = train_raw.corr()
corr_mat['revenue'].sort_values(ascending=False)
Better yet, visualize in a heatmap:
# Heatmap of the correlations between the target and the main numeric features.
key_cols = ['revenue', 'budget', 'popularity', 'runtime']
sns.heatmap(train_raw[key_cols].corr())
plt.show()
Let's plot the variables relations:
# Pairwise scatter plots (with regression fits) of every numeric feature;
# 'id' is an identifier, not a feature, so it is excluded.
numeric_features = train_raw.select_dtypes('number').drop(columns='id')
sns.pairplot(numeric_features, kind="reg", diag_kind="kde")
plt.show()
# Side-by-side horizontal bar charts of null counts per column, train vs test.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 15), sharey=False)
for ax, (df, split) in zip(axes, [(train_raw, 'Train'), (test_raw, 'Test')]):
    null_counts = df.isna().sum().sort_values(ascending=False)
    sns.barplot(null_counts.values, null_counts.index, orient='h', ax=ax)
    ax.set_title(f'{split} Null Values:')
plt.show()
belongs_to_collection - many movies aren't part of a collection. ("Logical" Null)
homepage - Not useful, we can drop it.
tagline - about 20% of movies don't have a tagline.
backdrop_path & poster_path - we will not use any image processing.
# Same comparison, but for "logically empty" string values: stringified empty
# lists/dicts and empty strings.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 15), sharey=False)
for ax, (df, split) in zip(axes, [(train_raw, 'Train'), (test_raw, 'Test')]):
    empty_counts = df.isin(['[]', '{}', '']).sum().sort_values(ascending=False)
    sns.barplot(empty_counts.values, empty_counts.index, orient='h', ax=ax)
    ax.set_title(f'{split} Empty Values:')
plt.show()
Every one of these is below 10% of the data, and thus ignorable.
# And once more for exact-zero values per column, train vs test.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(30, 15), sharey=False)
for ax, (df, split) in zip(axes, [(train_raw, 'Train'), (test_raw, 'Test')]):
    zero_counts = df.eq(0).sum().sort_values(ascending=False)
    sns.barplot(zero_counts.values, zero_counts.index, orient='h', ax=ax)
    ax.set_title(f'{split} Zero-Values:')
plt.show()
video - single valued and thus will be ignored.
budget & runtime - Will be imputed.
Following our data exploration, we won't use the following features:
backdrop_path - We won't use any image processing in this task.
homepage - Mostly Nulls.
poster_path - We won't use any image processing in this task.
imdb_id - We won't use IMDb.
video - Single valued, not useful.
status - Mostly single valued (Released), not useful.

from feature_engineering import *
# Remove the features we decided to drop, then separate the predictors from
# the target (revenue) for each split.
train = drop_features(train_raw)
train_Y = train['revenue']
train_X = train.drop(columns='revenue')

test = drop_features(test_raw)
test_Y = test['revenue']
test_X = test.drop(columns='revenue')
First, we should handle all nested collection attributes.
We used eval to convert the string representation of collection to an object.
Then, we've exploded (column-wise) selected attributes from each nested object.
The following attributes were added (mapped):
belongs_to_collection.id : If a movie belongs to a collection, then we keep the collection id, else None.
genres : Containing the genre name attributes.
production_companies.id & production_companies.origin_country : List of production companies' id attribute & production companies' origin country.
production_countries : List of countries (iso_3166) where the movie was filmed.
release_month, release_quarter, release_year : The month, quarter & year the film was released on.
spoken_languages : List of spoken languages in a movie (iso_639 attribute).
Keywords.id : List of id attribute for each Keyword.
cast.id & cast.gender : List of id & gender (attributes) of each cast member.
crew.id & crew.department : List of id & department (attributes) of each crew member.

flattened_train = features_flattening(train_X)
# Preview the flattened train split, then flatten and preview the test split.
# `features_flattening` comes from feature_engineering (imported above).
flattened_train.head()
flattened_test = features_flattening(test_X)
flattened_test.head()
Now we can try to manipulate the features to extract more information:
collection_size : specify the amount of movies in same collection (within data).
biggest_production_company_size : The size (film count) of the production company with most productions, within the production companies of a film.
most_companies_country_size : The size (companies count) of the country with most production companies, within the production companies countries of a film.
most_productions_country_size : The size (film count) of a production country, with most productions, within the production countries of a film.
cast.gender_ratio : The gender ratio (Males / Females + Males) of cast.
spoken_lang_num : The amount of spoken languages in a film.
overview_word_count : Amount of words (whitespaces) in overview.
tagline_char_count : The length (characters) of a tagline.
title_char_count : The length (characters) of a title.
cast_size : Amount of cast-members in a film.
crew_size : Amount of crew-members in a film.
[Department Name]_depart_size : The size of department in a film.
avg_runtime_by_year : Mean runtime of films in the released year of a movie.
avg_budget_by_year : Mean budget of films in the released year of a movie.
avg_popularity_by_year : Mean popularity of films in the released year of a movie.
title_changed : Boolean indicating whether the original title is different from title.

extracted_train = feature_extraction(flattened_train)
# Preview the engineered train features, then derive and preview the test
# features. `feature_extraction` comes from feature_engineering.
extracted_train.head()
extracted_test = feature_extraction(flattened_test)
extracted_test.head()
Our data contain many categorical features; we need to convert them to dummy variables before any learning can be performed.
There are about 10,000 unique keywords in the train data, so we will use only the top 20 most-frequent keywords in train. This will provide high confidence that these keywords will be meaningful in validation/test data.
There are also many unique production companies (about 70,000), so we will create a dummy variable from the top 10 most-frequent production companies in train data.
All these dummy variables will be extracted from train data, and be hard coded to use in test preprocessing.
# Add the hard-coded top-k dummy (one-hot) variables to each split and report
# how many columns were added.
dummy_train = add_dummies_train(extracted_train)
n_new_train = dummy_train.shape[1] - extracted_train.shape[1]
print(f"Added {n_new_train} dummy variables to train!")

dummy_test = add_dummies_test(extracted_test)
n_new_test = dummy_test.shape[1] - extracted_test.shape[1]
print(f"Added {n_new_test} dummy variables to test!")
# Drop unnecessary fields: the nested/tuple columns were already exploded into
# flat features, and the free-text columns will not be used directly.
nested_fields = [
    'genres', 'spoken_languages', 'production_countries',
    'production_companies.id', 'Keywords.id', 'cast.id', 'cast.gender',
    'crew.id', 'crew.department', 'belongs_to_collection.id',
    'production_companies.origin_country',
]
free_text_fields = ['original_language', 'original_title', 'overview', 'tagline', 'title']
dummy_train.drop(columns=nested_fields + free_text_fields, inplace=True)
dummy_test.drop(columns=nested_fields + free_text_fields, inplace=True)
We will use KNN (k = 5, Euclidean distance) Imputation to find budget & runtime for films with zero values.
# KNN-impute the missing budget & runtime values, then log-transform budget
# (np.log1p) to tame its heavy right tail.
imputated_train = missing_value_imputation(dummy_train)
imputated_test = missing_value_imputation(dummy_test)

imputated_train['budget'] = np.log1p(imputated_train['budget'])
imputated_test['budget'] = np.log1p(imputated_test['budget'])

imputated_train
imputated_test
We tried multiple models, while focusing on ensemble methods due to the amount of features.
Then, we used CV for hyperparameters tuning for two selected models:
Random Forest
n_estimators : 100, 200, 500, 1000, 1500, 2000.
criterion : mean squared error or mean absolute error.
max_features : N, square-root(N), log2(N), 0.2 * N, 0.4 * N, 0.6 * N, 0.8 * N.
max_depth : 10, 20, 30, ..., 200.
min_samples_split : 2, 5, 10, 20.
min_samples_leaf : 1, 2, 4, 8, 16.
bootstrap : True or False.

XGBoost
n_estimators : 100, 200, 500, 1000, 1500.
objective : Linear or Squared Error.
learning_rate : 0.01, 0.03, 0.05, 0.07.
max_depth : 4, 5, 6, 7, 8, 9, 10, 20.
gamma : 0, 0.3, 0.4, 0.5, 1, 5.
min_child_weight : 1, 4, 5.
reg_lambda : 0, 1, 5, 10.
reg_alpha : 0, 1, 2.

Finally, the following models were chosen:
from sklearn.metrics import mean_squared_log_error
def train_model(train_X, train_Y, model):
    """Fit *model* on a log1p-transformed target and pickle it to disk.

    The target (revenue) is heavily right-skewed, so the model is trained on
    np.log1p(train_Y); predictions must be inverted with np.expm1 (see
    evaluate_model).

    Parameters
    ----------
    train_X : feature matrix accepted by ``model.fit``.
    train_Y : raw (untransformed) target values.
    model : any estimator with a sklearn-style ``fit(X, y)`` method.

    Side effects: writes the fitted model to ``models/<ClassName>.pkl``.
    """
    # BUG FIX: `pickle` was used but never imported anywhere in this file,
    # so the original raised NameError when saving the model.
    import pickle

    model.fit(train_X, np.log1p(train_Y))
    with open(f"models/{model.__class__.__name__}.pkl", 'wb') as f:
        pickle.dump(model, f)
def evaluate_model(test_X, test_Y, model):
    """Print the test-set RMSLE of a model trained on log1p(revenue).

    Predictions are produced in log space and inverted with np.expm1 before
    the root-mean-squared-log-error against the raw targets is computed.
    """
    predictions = np.expm1(model.predict(test_X))
    msle = mean_squared_log_error(test_Y, predictions)
    rmsle = np.sqrt(msle)
    print(f"RMSLE for Test: {rmsle:.6f}")
# XGBoost Model:
import xgboost as xgb
# Final XGBoost configuration (chosen via the CV search described above).
xg_params = dict(
    subsample=0.6,
    reg_lambda=10,
    reg_alpha=2,
    objective='reg:squarederror',
    n_estimators=1000,
    min_child_weight=4,
    max_depth=7,
    learning_rate=0.01,
    gamma=0.5,
    colsample_bytree=0.6,
)
xgb_model = xgb.XGBRegressor(**xg_params, n_jobs=-1)
train_model(imputated_train, train_Y, xgb_model)
evaluate_model(imputated_test, test_Y, xgb_model)
# Random Forest Model:
from sklearn.ensemble import RandomForestRegressor
# Final Random Forest configuration (chosen via the CV search above).
# NOTE(review): criterion='mae' was deprecated in scikit-learn 1.0 and removed
# in 1.2 (renamed 'absolute_error'); kept as-is for the version this notebook
# was written against — confirm before upgrading sklearn.
rf_params = dict(
    n_estimators=1500,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features=0.4,
    max_depth=50,
    criterion='mae',
    bootstrap=False,
)
rf_model = RandomForestRegressor(**rf_params, n_jobs=-1)
train_model(imputated_train, train_Y, rf_model)
evaluate_model(imputated_test, test_Y, rf_model)